# In scikit-mobility, a set of trajectories is described by a TrajDataFrame,
# an extension of the pandas DataFrame that has specific column names and data types.
# A TrajDataFrame can contain many trajectories, and each row in the TrajDataFrame represents
# a point of a trajectory, described by three mandatory fields (aka columns):
# 1. latitude (type: float);
# 2. longitude (type: float);
# 3. datetime (type: date-time).
# Additionally, two optional columns can be specified:
# • uid (type: string) identifies the object associated with the point of the trajectory.
# If uid is not present, scikit-mobility assumes that the TrajDataFrame contains trajectories
# associated with a single moving object;
# • tid specifies the identifier of the trajectory to which the point belongs.
# If tid is not present, scikit-mobility assumes that all rows in the TrajDataFrame
# associated with a uid belong to the same trajectory.
import skmob

# Build a TrajDataFrame from a plain list of records.
# Each record is [user, latitude, longitude, timestamp string].
data_list = [
    [1, 39.984094, 116.319236, '2008-10-23 13:53:05'],
    [1, 39.984198, 116.319322, '2008-10-23 13:53:06'],
    [1, 39.984224, 116.319402, '2008-10-23 13:53:11'],
    [1, 39.984211, 116.319389, '2008-10-23 13:53:16'],
]
# The keyword arguments give the positional index of the latitude,
# longitude and datetime fields inside each record.
tdf = skmob.TrajDataFrame(
    data_list,
    latitude=1,
    longitude=2,
    datetime=3,
)
# Show the first rows, then confirm the type of the object we built.
print(tdf.head())
print(type(tdf))
# Now, create the TrajDataFrame from a pandas DataFrame.
import pandas as pd
# Create a DataFrame from the previous list, giving each field a column name.
data_df = pd.DataFrame(data_list, columns=['user', 'latitude', 'lng', 'hour'])
# Tell scikit-mobility which DataFrame column plays which role. The longitude
# column is named explicitly so the mapping does not rely on a library default.
tdf = skmob.TrajDataFrame(
    data_df,
    latitude='latitude',
    longitude='lng',
    datetime='hour',
    user_id='user',
)
# print the type of the object
print(type(tdf))
# print the TrajDataFrame
# (the pasted ">>> " interpreter prompts were removed: they are a SyntaxError in a script)
print(tdf)
# We can also create a TrajDataFrame from a file. For example, in the following
# we create a TrajDataFrame from a portion of a GPS trajectory dataset collected in the context of
# the GeoLife project by 178 users in a period of
# over four years from April 2007 to October 2011.
# Now, create a TrajDataFrame from a file.
# Download the file from https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/tutorial/data/geolife_sample.txt.gz
# and save it in the "Downloads" folder of your home directory.
from pathlib import Path

# Resolve the download location from the current user's home directory instead
# of hard-coding one person's profile path, so the script runs on any account.
geolife_path = Path.home() / 'Downloads' / 'geolife_sample.txt.gz'
# read the trajectory data (GeoLife, Beijing, China)
tdf = skmob.TrajDataFrame.from_file(
    str(geolife_path),
    latitude='lat',
    longitude='lon',
    user_id='user',
    datetime='datetime',
)
# print the TrajDataFrame
print(tdf)
# plot_trajectory's return value is not captured here, so nothing is shown
# unless this line is evaluated in an interactive session.
tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')
# In scikit-mobility, an origin-destination matrix is described by the FlowDataFrame structure,
# an extension of the pandas DataFrame that has specific column names and data types. A row in a FlowDataFrame represents a flow of objects between two locations, described by three mandatory columns:
# 1. origin (type: string);
# 2. destination (type: string);
# 3. flow (type: integer).
# Each FlowDataFrame is associated with a spatial tessellation,
# a geopandas GeoDataFrame that contains two mandatory columns:
# 1. tile_ID (type: integer) indicates the identifier of a location;
# 2. geometry indicates the polygon (or point) that describes the geometric shape of the location
# on a territory (e.g., a square, a voronoi shape, the shape of a neighborhood)
# Note that each location identifier in the origin and destination columns of a FlowDataFrame
# must be present in the associated spatial tessellation.
# Create a spatial tessellation from a GeoJSON file describing the counties of New York state.
import skmob
import geopandas as gpd
# location of the tessellation file
url_tess = 'https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/tutorial/data/NY_counties_2011.geojson'
# Read the file first, then rename its 'tile_id' column to 'tile_ID',
# the identifier column name used throughout the rest of this script.
raw_tessellation = gpd.read_file(url_tess)
tessellation = raw_tessellation.rename(columns={'tile_id': 'tile_ID'})
# show the first rows of the spatial tessellation
print(tessellation.head())
# CREATE A FLOW DATAFRAME
# ...from a spatial tessellation and a file of real flows between counties in NY State.
# Download the file with the real fluxes from:
# https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/tutorial/data/NY_commuting_flows_2011.csv
# and save it in the "Downloads" folder of your home directory.
from pathlib import Path

# Resolve the download location from the current user's home directory instead
# of hard-coding one person's profile path, so the script runs on any account.
ny_flows_path = Path.home() / 'Downloads' / 'NY_commuting_flows_2011.csv'
# load the real flows into a FlowDataFrame bound to the tessellation loaded above
fdf = skmob.FlowDataFrame.from_file(
    str(ny_flows_path),
    tessellation=tessellation,
    tile_id='tile_ID',
    sep=",",
)
# print a portion of the flows
print(fdf.head(10))
# A FlowDataFrame can be visualized on a folium interactive map using the plot_flows function,
# which plots the flows on a geographic map as lines between the centroids of the tiles
# in the FlowDataFrame's spatial tessellation:
fdf.plot_flows(flow_color='red')
# The spatial tessellation of a FlowDataFrame can be visualized using the plot_tessellation function;
# popup_features lists the tessellation columns passed to it for display.
tessellation_popup_columns = ['tile_ID', 'population']
fdf.plot_tessellation(popup_features=tessellation_popup_columns)
# Visualize the spatial tessellation and the flows together:
# draw the tessellation first and keep the returned map ...
m = fdf.plot_tessellation()
# ... then draw the flows onto that same map.
fdf.plot_flows(flow_color='red', map_f=m)
## The preprocessing needed for mobility data analysis:
#1. Noise filtering
#2. Stop detection
#3. Stop Clustering
#4. Trajectory Compression
# Note that, if a TrajDataFrame contains multiple trajectories from multiple users,
# the preprocessing methods automatically apply to the single trajectory and,
# when necessary, to the single moving object.
# ********************* Noise filtering ******************************************************
# A point is filtered out if the speed from the previous point is higher than
# the max_speed_kmh parameter (500 km/h by default, per the original note; it is also passed explicitly below).
from skmob.preprocessing import filtering
# drop every point reached at more than 500 km/h from the previous point
ftdf = filtering.filter(tdf, max_speed_kmh=500.)
print(ftdf.parameters)
# the number of deleted points is the difference in length before and after filtering
points_before, points_after = len(tdf), len(ftdf)
n_deleted_points = points_before - points_after
print(n_deleted_points)
# ********************************* Stop detection ************************************************
# Some points in a trajectory can represent POINTS OF INTEREST (POIs) such as schools, bars, restaurants,
# and also user-specific locations like home and work locations.
# These POIs are also called STOPS and can be detected in different ways.
# A common approach is to apply spatial clustering algorithms that group trajectory points by their spatial proximity.
from skmob.preprocessing import detection
# Compute the stops for each individual in the TrajDataFrame:
# identify the stops where the object spent at least a certain number of minutes within a certain distance.
stdf = detection.stops(tdf, stop_radius_factor=0.5, minutes_for_a_stop=20.0, spatial_radius_km=0.2, leaving_time=True)
# print the detected stops
print(stdf)
# Compare the number of points before and after stop detection.
# (the pasted ">>> " interpreter prompts were removed: they are a SyntaxError in a script)
print('Points of the original trajectory:\t%s'%len(tdf))
print('Points of stops:\t\t\t%s'%len(stdf))
## A new column leaving_datetime is added to the TrajDataFrame
## in order to indicate the time when the user left the stop location.
## We can then visualize the detected stops using the plot_stops function:
m = stdf.plot_trajectory(max_users=1, start_end_markers=False)
stdf.plot_stops(max_users=1, map_f=m)
# Bare expression: displays the first rows only in an interactive session.
tdf.head()
# ******************************** Trajectory Compression ***************************************
# Trajectory compression reduces the number of trajectory points while
# preserving the structure of the trajectory.
# For instance, the code below merges all the points that are closer
# than 0.2 km from each other.
from skmob.preprocessing import compression
# compress the trajectory using a spatial radius of 0.2 km
ctdf = compression.compress(tdf, spatial_radius_km=0.2)
# report how many points the original and the compressed TrajDataFrame contain
n_original_points = len(tdf)
n_compressed_points = len(ctdf)
print('Points of the original trajectory:\t%s' % n_original_points)
print('Points of the compressed trajectory:\t%s' % n_compressed_points)
# Patterns of human mobility can be captured at both the individual and the collective level:
# we can measure the mobility patterns of a single object or of a group as a whole.
# scikit-mobility provides a wide set of mobility measures, each implemented as a function that takes
# a TrajDataFrame as input and outputs a pandas DataFrame.
# Let's compute the radius of gyration, the jump lengths and the home locations of a TrajDataFrame
from skmob.measures.individual import jump_lengths, radius_of_gyration, home_location
# Load the first 100000 check-ins of the Brightkite dataset from a URL.
url = "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"
# The dump is assumed to have no header row, so header=None is required: with
# header=0 pandas would treat the first check-in as a header line and drop it
# in favour of `names`.
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    nrows=100000,
    names=['user', 'check-in_time', 'latitude', 'longitude', 'location id'],
)
# Map the check-in columns onto the TrajDataFrame roles.
tdf = skmob.TrajDataFrame(df, latitude='latitude', longitude='longitude', datetime='check-in_time', user_id='user')
# Bare expressions: these previews are displayed only in an interactive session.
df.head(10)
df.tail(10)
tdf.head(10)
tdf.tail(10)
# compute the radius of gyration for each individual
rg_df = radius_of_gyration(tdf)
print(rg_df)
# compute the jump lengths for each individual, on the points sorted by time
jl_df = jump_lengths(tdf.sort_values(by='datetime'))
print(jl_df.head())
# compute the home location for each individual
hl_df = home_location(tdf)
print(hl_df.head())
# now let's visualize a heat map of the home locations
import folium
from folium.plugins import HeatMap
# base map the heat layer is added to
m = folium.Map(tiles='openstreetmap', zoom_start=12, control_scale=True)
# one (lat, lng) pair per row of hl_df feeds the heat layer
home_points = hl_df[['lat', 'lng']].values
heat_layer = HeatMap(home_points)
heat_layer.add_to(m)
# Bare expression: the map is rendered only in an interactive session.
m
## Collective generative models
# Collective generative models estimate spatial flows between a set of discrete locations.
# Examples of spatial flows estimated with collective generative models include
# 1. commuting trips between neighborhoods,
# 2. migration flows between municipalities,
# 3. freight shipments between states,
# 4. and phone calls between regions.
# A collective generative model takes as input a spatial tessellation, i.e., a geopandas GeoDataFrame.
# This spatial tessellation should contain two columns: geometry and relevance.
# These columns are used to compute two variables:
# 1. the distance between the tiles
# 2. the importance (aka "attractiveness") of each tile
# A collective generative model produces a FlowDataFrame that contains the generated flows and the spatial tessellation
# The collective generative algorithms that we are going to use:
# 1. Gravity model
# 2. Radiation model
# The gravity model has two main methods:
# 1. fit method: calibrates the model parameters using a FlowDataFrame
# 2. generate method: generates flows on a given spatial tessellation
from skmob.utils import utils, constants
import geopandas as gpd
from skmob.models import Gravity
import numpy as np
from pathlib import Path
# load a spatial tessellation
url_tess = 'https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/tutorial/data/NY_counties_2011.geojson'
tessellation = gpd.read_file(url_tess).rename(columns={'tile_id': 'tile_ID'})
# Download the file with the real fluxes from:
# https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/tutorial/data/NY_commuting_flows_2011.csv
# and save it in the "Downloads" folder of your home directory.
# The path is resolved from the current user's home directory instead of
# hard-coding one person's profile path.
ny_flows_path = Path.home() / 'Downloads' / 'NY_commuting_flows_2011.csv'
fdf = skmob.FlowDataFrame.from_file(str(ny_flows_path), tessellation=tessellation, tile_id='tile_ID', sep=",")
# Compute the total outflow from each location of the tessellation (excluding self loops).
inter_tile_flows = fdf[fdf['origin'] != fdf['destination']]
# The deprecated `axis` keyword is not passed; grouping rows is the default.
outflow_by_origin = inter_tile_flows.groupby(by='origin')['flow'].sum()
# Compare identifiers as strings on both sides so the lookup below cannot miss
# because of an int/str dtype difference.
outflow_by_origin.index = outflow_by_origin.index.astype(str)
# Align the totals with the tessellation by tile identifier rather than by row
# position: a tile without any outflow gets 0 instead of causing a length
# mismatch, and a differently ordered tessellation cannot receive another
# tile's total.
tot_outflows = (
    tessellation['tile_ID']
    .astype(str)
    .map(outflow_by_origin)
    .fillna(0)
    .astype(outflow_by_origin.dtype)
    .values
)
tessellation[constants.TOT_OUTFLOW] = tot_outflows
# Instantiate a gravity model object and generate synthetic flows.
# Here the model is a singly constrained Gravity model.
gravity_singly = Gravity(gravity_type='singly constrained')
print(gravity_singly)
# Column names and output format handed to generate().
singly_generate_options = {
    'tile_id_column': 'tile_ID',
    'tot_outflows_column': 'tot_outflow',
    'relevance_column': 'population',
    'out_format': 'flows',
}
# Seed NumPy's global random generator right before generating.
np.random.seed(0)
synth_fdf = gravity_singly.generate(tessellation, **singly_generate_options)
# print a portion of the synthetic flows
print(synth_fdf.head())
# Instantiate a second singly constrained Gravity object and print it before fitting.
gravity_singly_fitted = Gravity(gravity_type='singly constrained')
print(gravity_singly_fitted)
# Fit the parameters of the Gravity model from the FlowDataFrame,
# then print the model again to see the fitted state.
gravity_singly_fitted.fit(fdf, relevance_column='population')
print(gravity_singly_fitted)
# Column names and output format handed to generate().
fitted_generate_options = {
    'tile_id_column': 'tile_ID',
    'tot_outflows_column': 'tot_outflow',
    'relevance_column': 'population',
    'out_format': 'flows',
}
# Seed NumPy's global random generator right before generating the synthetic flows.
np.random.seed(0)
synth_fdf_fitted = gravity_singly_fitted.generate(tessellation, **fitted_generate_options)
# print a portion of the synthetic flows
print(synth_fdf_fitted.head())
# Draw the real flows in blue and keep the returned map ...
m = fdf.plot_flows(min_flow=100, flow_exp=0.01, flow_color='blue')
# ... then draw the synthetic flows onto that same map.
synth_fdf_fitted.plot_flows(min_flow=1000, flow_exp=0.01, map_f=m)
# The Radiation model is parameter-free and has only one method: generate.
# Given a spatial tessellation, the synthetic flows can be generated
# with the Radiation class as follows:
from skmob.models import Radiation
# instantiate a Radiation object
radiation = Radiation()
# Column names and output format handed to generate().
radiation_generate_options = {
    'tile_id_column': 'tile_ID',
    'tot_outflows_column': 'tot_outflow',
    'relevance_column': 'population',
    'out_format': 'flows_sample',
}
# Seed NumPy's global random generator right before starting the simulation.
np.random.seed(0)
rad_flows = radiation.generate(tessellation, **radiation_generate_options)
# print a portion of the synthetic flows
print(rad_flows.head())